###load targeted normal read counts
targeted_normal_reads <- read.table("../Gaudin_thesis_data/97_sample_st.dat", stringsAsFactors = F, header = TRUE, sep=",")
##format targeted normal reads tables -- [[1]]: VAF frequency [[2]] coverage at SNP site
targeted_normal_read_table <- format_het_table(targeted_normal_reads,25,75)
##indices of normal samples run on NovaSeq 6000
nova_indices <- readRDS("../Gaudin_thesis_data/nova_indices.rds")
##only include NovaSeq normal samples
targeted_normal_read_table[[1]] <- targeted_normal_read_table[[1]][,c(c(1,2),c(nova_indices + 2))] ##nova_indices + 2 accounts for chromosome and position columns
targeted_normal_read_table[[2]] <- targeted_normal_read_table[[2]][,c(c(1,2),c(nova_indices + 2))]
st_alt_table <- targeted_normal_read_table[[1]][,3:ncol(targeted_normal_read_table[[1]])] ## remove chromosome and position
st_cov_table <- targeted_normal_read_table[[2]][,3:ncol(targeted_normal_read_table[[2]])] ## remove chromosome and position
avg_alts <- c()
avg_covs <- c()
for(i in 1:nrow(st_alt_table)){
indices <- which(st_alt_table[i,] > 25 & st_alt_table[i,] < 75 & st_cov_table[i,] > 100) ##only include SNP sites with VAF frequency between 25 - 75 and coverage > 100,
if(length(indices) > 7){ ##only average SNPs that are heterozygous at at least 7 SNP sites
avg_alts <- c(avg_alts, mean(as.numeric(st_alt_table[i,indices])))
avg_covs <- c(avg_covs, mean(as.numeric(st_cov_table[i,indices])))
}
}
cov_df <- data.frame(Average_vaf = avg_alts, Average_coverage = avg_covs)
cov_df <- cov_df[!is.na(cov_df$Average_vaf),]
print("number of SNPs included")
## [1] "number of SNPs included"
print(nrow(cov_df))## print number of SNPs
## [1] 551
print(ggplot(cov_df, aes(x=Average_coverage, y=Average_vaf)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed"))
gene_table <- read.table("GENES_chrom_start_stop.txt", header=TRUE)
##adjust for targeting of genes for +/- 1MB , but dont overlap MSH2 and MSH6
gene_table$Start[-c(4)] <- gene_table$Start[-c(4)] - 1000000
gene_table$Stop[-c(3)] <- gene_table$Stop[-c(3)] + 1000000
print(gene_table)
## GENE Chromosome Start Stop
## 1 BRCA2 13 31889611 33973809
## 2 BRCA1 17 40196312 42277500
## 3 MSH2 2 46630108 47789450
## 4 MSH6 2 47922669 49037240
## 5 MLH1 3 36034823 38107380
## 6 PMS2 7 5012870 7048756
## 7 CDK4 12 57141510 59149796
## 8 MDM2 12 68201956 70239214
## 9 ERBB2 17 36844167 38886697
## 10 EGFR 7 54086714 56324313
## 11 MET 7 115312444 117438440
## 12 MYC 8 127747680 129753680
normal_targeted_het_counts <- gene_het_count_table(gene_table,targeted_normal_read_table,25,75,100)
for(i in 1:ncol(normal_targeted_het_counts)){
print(hist(normal_targeted_het_counts[,i], breaks = max(normal_targeted_het_counts[,i]), main=paste("Heterozygous SNPs for Each Sample (n = 47) Targeting", colnames(normal_targeted_het_counts)[i]), xlab="Number of Heterozygous SNPs", ylab="Number of Samples"))
}
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
##
## $counts
## [1] 7 2 2 0 0 4 3 2 1 3 4 2 1 4 4 1 7
##
## $density
## [1] 0.14893617 0.04255319 0.04255319 0.00000000 0.00000000 0.08510638
## [7] 0.06382979 0.04255319 0.02127660 0.06382979 0.08510638 0.04255319
## [13] 0.02127660 0.08510638 0.08510638 0.02127660 0.14893617
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##
## $counts
## [1] 19 2 2 2 3 0 0 0 0 0 0 0 0 0 1 0 1 0 1 16
##
## $density
## [1] 0.40425532 0.04255319 0.04255319 0.04255319 0.06382979 0.00000000
## [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## [13] 0.00000000 0.00000000 0.02127660 0.00000000 0.02127660 0.00000000
## [19] 0.02127660 0.34042553
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5 19.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
##
## $counts
## [1] 19 2 1 0 0 0 1 1 1 0 0 1 1 1 2 2 1 2 12
##
## $density
## [1] 0.40425532 0.04255319 0.02127660 0.00000000 0.00000000 0.00000000
## [7] 0.02127660 0.02127660 0.02127660 0.00000000 0.00000000 0.02127660
## [13] 0.02127660 0.02127660 0.04255319 0.04255319 0.02127660 0.04255319
## [19] 0.25531915
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
##
## $counts
## [1] 4 0 0 2 9 5 2 5 3 3 7 3 1 1 0 2
##
## $density
## [1] 0.08510638 0.00000000 0.00000000 0.04255319 0.19148936 0.10638298
## [7] 0.04255319 0.10638298 0.06382979 0.06382979 0.14893617 0.06382979
## [13] 0.02127660 0.02127660 0.00000000 0.04255319
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##
## $counts
## [1] 21 2 2 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 10 7
##
## $density
## [1] 0.44680851 0.04255319 0.04255319 0.00000000 0.00000000 0.00000000
## [7] 0.02127660 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## [13] 0.02127660 0.02127660 0.00000000 0.02127660 0.00000000 0.02127660
## [19] 0.21276596 0.14893617
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5 19.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
##
## $counts
## [1] 13 2 2 0 0 2 3 5 10 4 0 1 1 1 2 1
##
## $density
## [1] 0.27659574 0.04255319 0.04255319 0.00000000 0.00000000 0.04255319
## [7] 0.06382979 0.10638298 0.21276596 0.08510638 0.00000000 0.02127660
## [13] 0.02127660 0.02127660 0.04255319 0.02127660
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
##
## $counts
## [1] 10 1 6 1 2 6 1 2 1 0 1 2 2 2 1 1 4 1 3
##
## $density
## [1] 0.21276596 0.02127660 0.12765957 0.02127660 0.04255319 0.12765957
## [7] 0.02127660 0.04255319 0.02127660 0.00000000 0.02127660 0.04255319
## [13] 0.04255319 0.04255319 0.02127660 0.02127660 0.08510638 0.02127660
## [19] 0.06382979
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
##
## $counts
## [1] 2 2 2 3 2 3 4 4 2 5 5 2 3 2 3 3
##
## $density
## [1] 0.04255319 0.04255319 0.04255319 0.06382979 0.04255319 0.06382979
## [7] 0.08510638 0.08510638 0.04255319 0.10638298 0.10638298 0.04255319
## [13] 0.06382979 0.04255319 0.06382979 0.06382979
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
##
## $counts
## [1] 18 8 0 2 0 0 2 1 2 0 2 0 0 2 1 0 5 0 4
##
## $density
## [1] 0.38297872 0.17021277 0.00000000 0.04255319 0.00000000 0.00000000
## [7] 0.04255319 0.02127660 0.04255319 0.00000000 0.04255319 0.00000000
## [13] 0.00000000 0.04255319 0.02127660 0.00000000 0.10638298 0.00000000
## [19] 0.08510638
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##
## $counts
## [1] 1 1 4 5 5 4 5 6 3 8 1 2 1 1
##
## $density
## [1] 0.02127660 0.02127660 0.08510638 0.10638298 0.10638298 0.08510638
## [7] 0.10638298 0.12765957 0.06382979 0.17021277 0.02127660 0.04255319
## [13] 0.02127660 0.02127660
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## $counts
## [1] 12 6 2 1 3 5 0 3 0 2 2 10 1
##
## $density
## [1] 0.25531915 0.12765957 0.04255319 0.02127660 0.06382979 0.10638298
## [7] 0.00000000 0.06382979 0.00000000 0.04255319 0.04255319 0.21276596
## [13] 0.02127660
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## $breaks
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## $counts
## [1] 5 5 4 1 2 1 1 6 10 5 4 2 1
##
## $density
## [1] 0.10638298 0.10638298 0.08510638 0.02127660 0.04255319 0.02127660
## [7] 0.02127660 0.12765957 0.21276596 0.10638298 0.08510638 0.04255319
## [13] 0.02127660
##
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
##
## $xname
## [1] "normal_targeted_het_counts[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
#normal_targeted_het_counts
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
clin_sample_paths <- paste("../Gaudin_thesis_data/standard/",clin_samples,".dat",sep="")
clin_sample_counts_merged <- merge_count_files(clin_sample_paths) ##merged SNP counts file
normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
normal_sample_paths <- paste("../Gaudin_thesis_data/nova_standard/",normal_samples,".dat",sep="")
normal_sample_counts_merged <- merge_count_files(normal_sample_paths) ##merged SNP counts file
saveRDS(normal_sample_counts_merged,"normal_table_merged.rds")
saveRDS(clin_sample_counts_merged,"clinical_table_merged.rds")
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
tumor_countsmerged <- readRDS("../Gaudin_thesis_data/clinical_table_merged.rds") ##load saved counts_merged file
normal_countsmerged <- readRDS("../Gaudin_thesis_data/normal_table_merged.rds") ##load saved counts_merged file
normal_and_clinical_sample_names <- c(as.character(normal_samples),as.character(clin_samples)) ##combine into single list
normal_tables <- format_het_table(normal_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
tumor_tables <- format_het_table(tumor_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
complete_vaf_df <- VAF_Z_Score_analysis(normal_tables, tumor_tables, 250, 25, 75, 5, 95, 7,normal_and_clinical_sample_names)
saveRDS(complete_vaf_df,"../Gaudin_thesis_data/cdf_250_25_75.rds")
complete_vaf_df <- readRDS("../Gaudin_thesis_data/cdf_250_25_75.rds")
#complete_vaf_df
gene_specific_z_scores <- gene_specific_tables(gene_table, tumor_tables, normal_tables, clin_samples, 25,75,5,95,7,250)
gene_z_df <- gene_specific_z_scores[[1]]
gene_counts_df <- gene_specific_z_scores[[2]]
#saveRDS(gene_z_df,"../Gaudin_thesis_data/clin_gene_z.rds")
#saveRDS(gene_counts_df,"../Gaudin_thesis_data/clin_gene_counts.rds")
sample_names <- read.table("../Gaudin_thesis_data/nova_clin_names.txt")$V1
sample_files <- read.table("../Gaudin_thesis_data/nova_clin_tlen_paths.txt")$V1
for(i in 1:length(sample_files)){
frag_lengths <- read.table(as.character(sample_files[i]))$V1
if(i == 1){
nt_frags_binsize_5 <- bin_fragments(frag_lengths,5,1,600,as.character(sample_names[i]))
}else{
nt_frags_binsize_5 <- merge(nt_frags_binsize_5,bin_fragments(frag_lengths,5,1,600,sample_names[i]), by=c('start','end'))
}
print(i)
}
saveRDS(nt_frags_binsize_5,"../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
frag_table <- frag_table[,c(c(26:31),c(35:40),c(51:75))] #bins between 125 - 155, 170 - 200, 250 - 375
frag_table_normal <- frag_table[c(1:47),]
frag_table_clin <- frag_table[c(48:nrow(frag_table)),]
frag_table_normal$sample_names = rownames(frag_table_normal)
frag_table_clin$sample_names = rownames(frag_table_clin)
frag_avg_z_scores <- calculate_frag_z_scores(frag_table_clin,frag_table_normal)
saveRDS(frag_avg_z_scores,"../Gaudin_thesis_data/clin_frag_z_scores_5.rds")
clin_mut_percent_table <- readRDS("../Gaudin_thesis_data/clin_ctDNA_est_table.rds")
clin_sample_key_table <- readRDS("../Gaudin_thesis_data/key_reference.rds") ##data containing other sample identifiers
complete_vaf_df$sample_names <- str_replace(complete_vaf_df$sample_names, "_DONOR", "-DONOR") ##adjust to fit sample names in other files
cnv_table <- read.table("../Gaudin_thesis_data/data_CNA.txt", header = TRUE) ##table showing called cnvs
####formatting for merging######
colnames(cnv_table) <- str_replace_all(colnames(cnv_table), "[.]", "-")
row.names(cnv_table) <- cnv_table$Hugo_Symbol
cnv_table <- cnv_table[,-c(1)]
cnv_table <- t(cnv_table)
cnv_table <- data.frame(cnv_table) ##convert from atomic vector
##egfr flagged mutaions from cbio
egfr_cnv <- data.frame(Tumor_Sample_Barcode = row.names(cnv_table), egfr_cn = cnv_table$EGFR)
all_data_table <- merge(clin_mut_percent_table, egfr_cnv, all = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0
all_data_table <- merge(all_data_table, clin_sample_key_table, by.x = "Tumor_Sample_Barcode", by.y = "id_1") ##id_1 = tumor sample barcode, id_2 == sample name
all_data_table <- merge(complete_vaf_df,all_data_table,by.x = "sample_names", by.y = "id_2", all.x = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0
frag_avg_z_scores$sample_names <- str_replace(frag_avg_z_scores$sample_names, "_DONOR","-DONOR")
all_data_table <- clin_total_table <- merge(all_data_table,frag_avg_z_scores,by = "sample_names")
all_data_table <- all_data_table[-c(which(all_data_table$pool == "tumor" & is.na(all_data_table$Tumor_Sample_Barcode))),] ##remove tumor samples (4) which don't have egfr or barcode data
##add clinical patient IDs
patient_ids <- all_data_table$Tumor_Sample_Barcode
patient_ids <-str_replace_all(patient_ids, "-T.*", "")
all_data_table$patient_ids <- patient_ids
##add egfr gene specific Z scores
egfr_z_scores <- data.frame(sample_names = gene_z_df$sample_name, egfr_z_score = gene_z_df$EGFR, egfr_het_SNP_count = gene_counts_df$EGFR)
all_data_table <- merge(all_data_table, egfr_z_scores, by.x = "sample_names", all.x = TRUE)
#all_data_table
##mark samples with same patient id that show copy number flagged in some samples but not others (-1 means flagged for CNV in other patient samples)
updated_egfr_cn <- c()
for(i in 1:nrow(all_data_table)){
if(is.na(all_data_table$egfr_cn[i])){
updated_egfr_cn <- c(updated_egfr_cn,NA)
}
else{
if(all_data_table$egfr_cn[i] == 2){
#print("hi")
updated_egfr_cn <- c(updated_egfr_cn, 2)
}
else{
if(sum(all_data_table$egfr_cn[which(all_data_table$patient_ids == all_data_table$patient_ids[i])]) > 0){
updated_egfr_cn <- c(updated_egfr_cn, -1)
}
else{
updated_egfr_cn <- c(updated_egfr_cn, 0)
}
}
}
}
all_data_table$updated_egfr_cn <- updated_egfr_cn
ggplot(all_data_table, aes(x = pool, y = VAF_Z_Score)) + geom_boxplot() + xlab("Pool") + ylab("Mean Absolute heterozygous SNP VAF Z-score")
#### Figure 4. Histograms showing distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool
a <- ggplot(all_data_table, aes(x=VAF_Z_Score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) + theme(legend.position="top") + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
b <- ggplot(all_data_table, aes(x=VAF_Z_Score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) +
theme(legend.position="top") + xlim(0.5,2) + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
print(a)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
print(b)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 72 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
#plot_grid(a, b, labels = "AUTO")
p <- ggplot()
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean Fragment Length Z-score")
print(p)
g <- p + xlim(0,0.2) + ylim(0.1,3)
print(g)
## Warning: Removed 99 rows containing missing values (geom_point).
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Sensitivity"
## [1] 0.3326271
## [1] "AUC"
## [1] 0.6494771
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Sensitivity"
## [1] 0.4181287
## [1] "AUC"
## [1] 0.7085977
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Sensitivity"
## [1] 0.9626168
## [1] "AUC"
## [1] 0.9976138
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
ggplot(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$egfr_het_SNP_count > 7),], aes(x=mean_AF, y= egfr_z_score, color = as.character(updated_egfr_cn))) + geom_point() + ylab("Mean Z-Score for EGFR Heterozygous SNPs") + xlab("Estimated ctDNA Fraction") + ggtitle("EGFR Mean SNP VAF Z-Score vs Predicted ctDNA Fraction and CNV") + scale_color_discrete(name = "Predicted CNV Amplification", labels = c("Amplified in other samples from same patient", "No amplification", "Amplification"))
print("number of SNPs")
## [1] "number of SNPs"
print(length(which(all_data_table$egfr_het_SNP_count > 7)))
## [1] 401
print("number of SNPs from patients with no flagged EGFR cnv")
## [1] "number of SNPs from patients with no flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 0)))
## [1] 390
print("number of SNPs from samples with flagged EGFR cnv")
## [1] "number of SNPs from samples with flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 2)))
## [1] 9
print("number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples")
## [1] "number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == -1)))
## [1] 2
ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) + theme(legend.position="top") + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) +
theme(legend.position="top") + xlim(0.1,3) + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 96 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
p <- ggplot()
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "tumor",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "normal",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean Fragment Length Z-score")
print(p)
g <- p + xlim(0,0.2) + ylim(0.1,3)
print(g)
## Warning: Removed 100 rows containing missing values (geom_point).
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Sensitivity"
## [1] 0.2605932
## [1] "AUC"
## [1] 0.8364587
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Sensitivity"
## [1] 0.3187135
## [1] "AUC"
## [1] 0.866368
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Sensitivity"
## [1] 0.5607477
## [1] "AUC"
## [1] 0.9407437
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
print(ggplot(all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = mean_AF)) + geom_point() + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Estimated ctDNA Fraction"))
print("Correlation for VAF Z-score and Frag Z Score in all samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in all samples"
print(cor(all_data_table$VAF_Z_Score, all_data_table$abs_frag_z_score))
## [1] 0.5649104
print("Correlation for VAF Z-score and Frag Z Score in tumor samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in tumor samples"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score, all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 0.5564904
f <- ggplot() + geom_point(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool)) + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Pool")
f <- f + geom_point(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool))
print(f)
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "VAF_with_frag_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Sensitivity"
## [1] 0.4258475
## [1] "AUC"
## [1] 0.6871619
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "VAF_with_frag_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Sensitivity"
## [1] 0.5204678
## [1] "AUC"
## [1] 0.746485
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "VAF_with_frag_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Sensitivity"
## [1] 0.9626168
## [1] "AUC"
## [1] 0.9976138
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)
frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
frag_table$sample_names <- str_replace(frag_table$sample_names, "_DONOR", "-DONOR")
normal_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "normal"),]$sample_names)
tumor_all_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor"),]$sample_names)
tumor_above_0_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$sample_names)
tumor_above_05_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$sample_names)
normal_frag_table <- frag_table[normal_indices,-c(ncol(frag_table))]
clin_table_all <- frag_table[tumor_all_indices,-c(ncol(frag_table))]
normal_avgs <- colSums(normal_frag_table)/nrow(normal_frag_table)
clin_avgs_all <- colSums(clin_table_all)/nrow(clin_table_all)
all_avgs <- c(normal_avgs,clin_avgs_all)
clin_table_above_0 <- frag_table[tumor_above_0_indices,-c(ncol(frag_table))]
clin_avgs_above_0 <- colSums(clin_table_above_0)/nrow(clin_table_above_0)
clin_table_above_05 <- frag_table[tumor_above_05_indices,-c(ncol(frag_table))]
clin_avgs_above_05 <- colSums(clin_table_above_05)/nrow(clin_table_above_05)
all_avgs_above_0 <- c(normal_avgs,clin_avgs_above_0)
all_avgs_above_05 <- c(normal_avgs,clin_avgs_above_05)
repeated <- length(all_avgs)/2
normal_names <- rep("normal",repeated)
tumor_names <- rep("tumor",repeated)
all_names <- c(normal_names,tumor_names)
intervals <- c(seq(1,595,5),seq(1,595,5))
fragments_df_all <- data.frame(avg_coverage = all_avgs, pool = all_names, bin = intervals)
fragments_df_above_0 <- data.frame(avg_coverage = all_avgs_above_0, pool = all_names, bin = intervals)
fragments_df_above_05 <- data.frame(avg_coverage = all_avgs_above_05, pool = all_names, bin = intervals)
print(ggplot(fragments_df_all, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(100,400))
## Warning: Removed 118 rows containing missing values (geom_path).
print(ggplot(fragments_df_above_0, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(100,400))
## Warning: Removed 118 rows containing missing values (geom_path).
print(ggplot(fragments_df_above_05, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(100,400))
## Warning: Removed 118 rows containing missing values (geom_path).
tsne_table_sd_scaled <- frag_table[c(normal_indices,tumor_all_indices),-c(ncol(frag_table))]
for(i in 1:ncol(tsne_table_sd_scaled)){
min <- min(tsne_table_sd_scaled[,i])
max <- max(tsne_table_sd_scaled[,i])
max_m_min <- max - min
tsne_table_sd_scaled[,i] <- (tsne_table_sd_scaled[,i] - min)/max_m_min
}
value_vector <- rep("normal",nrow(frag_table))
value_vector[tumor_all_indices] <- "ctDNA_est_0"
value_vector[tumor_above_0_indices] <- "ctDNA_above_0"
value_vector[tumor_above_05_indices] <- "ctDNA_above_05"
value_vector <- value_vector[c(normal_indices,tumor_all_indices)]
#make colors and labels vectors
color_vector <- value_vector
color_vector[which(color_vector == "normal")] <- "2" #red
color_vector[which(color_vector == "ctDNA_est_0")] <- "3" #green
color_vector[which(color_vector == "ctDNA_above_0")] <- "4" #blue
color_vector[which(color_vector == "ctDNA_above_05")] <- "1" #black
color_vector <- as.numeric(color_vector)
label_vector <- color_vector
label_vector[] <- "*"
#make TSNE
train <- as.matrix(tsne_table_sd_scaled)
tsne <- Rtsne(train, dims = 2, perplexity=40, verbose=FALSE, max_iter = 3000)
plot(tsne$Y, t='n', main="tsne")
text(tsne$Y, labels = label_vector, col = color_vector)
##may need to adjust legend position (x, y are first two parameters)
legend(-20, -5, legend=c("normal", "ctDNA_est_0","ctDNA_above_0", "ctDNA_above_05" ),col=c("red", "green", "blue", "black"), lwd = 1, box.lty=0, box.lwd=0)
###### PCA
pca <- prcomp(t(tsne_table_sd_scaled), scale = FALSE)
pca <- as.data.frame(pca[2]$rotation)
pca$status <- value_vector
print(ggplot(pca) + geom_point(aes(PC1,PC2, fill = status, color = status)))
VAF_Z_table <- data.frame(sample_names = all_data_table$sample_names, pool = all_data_table$pool, mean_AF= all_data_table$mean_AF, VAF_Z_Score = all_data_table$VAF_Z_Score)
VAF_frag_table <- merge(VAF_Z_table, frag_table[,c(c(26:31),c(35:40),c(51:75),c(ncol(frag_table)))], by="sample_names") ##frags with greatest diff between normal and tumor
print("All normal and tumor samples")
## [1] "All normal and tumor samples"
calclulate_mean_SVM_100(VAF_frag_table)
## [1] "true_neg"
## [1] 22.63
## [1] "false_neg"
## [1] 10.44
## [1] "false_pos"
## [1] 24.37
## [1] "true_pos"
## [1] 461.56
print("All normal and tumor samples with ctDNA > 0")
## [1] "All normal and tumor samples with ctDNA > 0"
calclulate_mean_SVM_100(VAF_frag_table[which(VAF_frag_table$pool == "normal" | VAF_frag_table$mean_AF > 0),])
## [1] "true_neg"
## [1] 29.92
## [1] "false_neg"
## [1] 10.64
## [1] "false_pos"
## [1] 17.08
## [1] "true_pos"
## [1] 331.36
print("All normal and tumor samples with ctDNA > 0.05")
## [1] "All normal and tumor samples with ctDNA > 0.05"
calclulate_mean_SVM_100(VAF_frag_table[which(VAF_frag_table$pool == "normal" | VAF_frag_table$mean_AF > 0.05),])
## [1] "true_neg"
## [1] 42.58
## [1] "false_neg"
## [1] 5.33
## [1] "false_pos"
## [1] 4.42
## [1] "true_pos"
## [1] 101.67
library(ISLR)
log_vf_frag_df <- data.frame(pool = all_data_table$pool, VAF_Z_Score = all_data_table$VAF_Z_Score, abs_frag_z_score = all_data_table$abs_frag_z_score)
#log_vf_frag_df
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score, data = log_vf_frag_df, family = binomial)
#summary(glm.fit)
glm.probs <- predict(glm.fit,type = "response")
#table(all_data_table$pool,glm.pred)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
e <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.8459
#plot(e, xlim=c(0.1, 0), ylim = c(0,1), col="blue")
ci.se(e,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 0.2966 0.3538 0.5954
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$abs_frag_z_score)
#log_vf_frag_df
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(glm.fit)
glm.probs <- predict(glm.fit,type = "response")
#table(all_data_table$pool,glm.pred)
library(pROC)
f <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.8783
#plot(f, asp = NA, xlim=c(1, 0), ylim = c(0,1), col="red")
ci.se(f,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 0.3743 0.4444 0.6988
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$abs_frag_z_score)
#log_vf_frag_df
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(glm.fit)
glm.probs <- predict(glm.fit,type = "response")
#table(all_data_table$pool,glm.pred)
library(pROC)
g <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.9984
plot(g, asp = NA, xlim=c(1, 0), ylim = c(0,1), col="green")
lines(e, col="red",lty=1)
lines(f, col="blue",lty=1)
legend(0.4,0.3,legend=c("all samples","ctDNA above 0","ctDNA above 0.05"), col=c("red","blue","green"), lty = 1)
ci.se(g,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
## sp se.low se.median se.high
## 1 0.9533 0.9813 1